# -*- coding: utf-8 -*-
'''
Created on 2017-03-20

@author: ZhuJiahui
'''

import os
import time
from global_info.global_nlp import GlobalNLP
from nlp_utils.word_segment import get_stopwords, en_filtered_word_segment
from file_utils.file_writer import quick_write_1d_to_text

def word_segment_test(read_filename, write_filename):
    '''
    Segment the fourth column of every line of a delimited text file.

    Each line of ``read_filename`` is stripped and split on
    ``GlobalNLP.EN_COLUMN_DELIMITER``; the field at index 3 is passed to
    ``en_filtered_word_segment`` together with the stop-word list, and the
    returned tokens are joined with single spaces.  A line with fewer than
    four fields is segmented as ``GlobalNLP.NULL_STR`` instead, so the
    result always holds exactly one entry per input line.

    :param read_filename: path of the delimited input text file
    :param write_filename: path handed to ``quick_write_1d_to_text``
        together with the list of segmented lines
    '''
    stopwords_list = get_stopwords()
    separater_string = GlobalNLP.EN_COLUMN_DELIMITER
    to_write = []
    with open(read_filename, 'r') as f:
        for each_line in f:
            try:
                line_text = each_line.strip().split(separater_string)[3]
            except IndexError:
                # Only a short line (fewer than four columns) is an expected
                # failure here.  Anything else, such as an invalid delimiter,
                # is a real error and must not be silently replaced.
                line_text = GlobalNLP.NULL_STR
            line_segment = en_filtered_word_segment(line_text, stopwords_list)
            to_write.append(" ".join(line_segment))

    # Write after the ``with`` block so the input handle is already closed.
    quick_write_1d_to_text(write_filename, to_write)

if __name__ == '__main__':

    # Script entry point: segment the 20newsgroup train and test text files
    # and report the elapsed time.

    # time.clock() was removed in Python 3.8.  Use perf_counter() where it
    # exists and fall back to clock() only on interpreters that predate it
    # (the ``or`` short-circuits, so time.clock is never touched otherwise).
    timer = getattr(time, 'perf_counter', None) or time.clock
    start = timer()

    # The dataset directory is resolved relative to the parent of the
    # current working directory, not relative to this file's location.
    now_directory = os.getcwd()
    root_directory = os.path.dirname(now_directory) + '/'
    read_filename1 = root_directory + u'dataset/20newsgroup/mixture_train_text.txt'
    write_filename1 = root_directory + u'dataset/20newsgroup/segment_pos_train.txt'
    read_filename2 = root_directory + u'dataset/20newsgroup/mixture_test_text.txt'
    write_filename2 = root_directory + u'dataset/20newsgroup/segment_pos_test.txt'

    word_segment_test(read_filename1, write_filename1)
    word_segment_test(read_filename2, write_filename2)

    print('Total time %f seconds' % (timer() - start))
    
            